In this dataset, were are going to be exploring sales information provided.
In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output.
Let us start by importing some helper libraries and the dataset as well
In [2]:
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from brewer2mpl import qualitative
In [3]:
df= pd.read_csv("../input/Video_Games_Sales_as_at_22_Dec_2016.csv")
df.shape
Out[3]:
Now that we know the shape of the dataset, let's have a peek at the data and try to find if they are any missing values.
In [4]:
df.head()
Out[4]:
The cell below shows the data type of the columns
In [5]:
df.info()
The cell below shows the column name and the number of empty rows.
In [6]:
df.isnull().sum()
Out[6]:
A list of all the columns in the dataset
In [9]:
df.columns.tolist()
Out[9]:
Calculating the % of missing values
In [10]:
df_na= ( df.isnull().sum() / len(df) ) * 100
df_na= df_na.drop(df_na[df_na == 0].index).sort_values(ascending= False)
In [11]:
f, ax= plt.subplots(figsize=(12, 8))
plt.xticks(rotation='90')
sns.barplot(x=df_na.index, y=df_na.values)
ax.set(title='Missing Values Plot', ylabel='% Missing')
Out[11]:
Unique Gaming platforms
In [13]:
df.Platform.unique()
Out[13]:
In [14]:
#df.Platform.value_counts()
ssc = df.Platform.value_counts()
f, ax= plt.subplots(figsize=(12, 8))
plt.xticks(rotation='90')
sns.barplot(x=ssc.values, y=ssc.index, orient='h')
ax.set(title='Consoles by count', ylabel='Count')
f.tight_layout()
dropping all NA values
In [15]:
df_clean= df.dropna(axis=0)
df_clean.shape
Out[15]:
In [16]:
ssc = df_clean.Platform.value_counts()
f, ax= plt.subplots(figsize=(12, 8))
plt.xticks(rotation='90')
sns.barplot(x=ssc.values, y=ssc.index, orient='h')
ax.set(title='Consoles by count after dropping NAs', ylabel='Count')
f.tight_layout()
In [17]:
#df['User_Score']= df['User_Score'].convert_objects(convert_numeric=True)
df_clean.User_Score= df_clean.User_Score.astype('float')
#df.User_Score.dtype
#df['User_Score'] = video['User_Score'].convert_objects(convert_numeric= True)
Plot of the user score v the critic score of games. It appears the users and the critics agree on games with score greater than 8.
In [18]:
sns.jointplot(x='User_Score', y='Critic_Score', data=df_clean, kind='hex', cmap='coolwarm', size=7)
Out[18]:
Critic score v critic count. From this plot, we observe that few critics give scores above 80.
In [19]:
sns.jointplot(x='Critic_Score', y='Critic_Count', data=df_clean, kind='hex', cmap='plasma', size=7)
Out[19]:
CORRELATION BETWEEN COLUMNS
In [22]:
stats=['Year_of_Release','NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales',
'Global_Sales', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count',
'Rating']
corrmat = df_clean[stats].corr()
f, ax = plt.subplots(figsize=(10, 7))
plt.xticks(rotation='90')
plt.title('correlation between columns')
sns.heatmap(corrmat, square=True, linewidths=.5, annot=True)
Out[22]:
Taking a look at Playstation
In [25]:
play= df_clean[(df_clean['Platform']== 'PS2') | (df_clean['Platform']== 'PS3')
| (df_clean['Platform']== 'PS')| (df_clean['Platform']== 'PS4')]
play.shape
Out[25]:
Playststation Global 1994-2016
In [26]:
sales_Play= play.groupby(['Year_of_Release', 'Platform'])['Global_Sales'].sum()
sales_Play.unstack().plot(kind='bar',stacked=True, colormap= 'Oranges', grid=False)
ax.set(title='Playststation Global over the year', ylabel='Cumulative count')
Out[26]:
Top selling genre for Playstation
In [28]:
sales_Play= play.groupby(['Genre', 'Platform'])['Global_Sales'].sum()
sales_Play.unstack().plot(kind='bar',stacked=True, colormap= 'Oranges', grid=False)
Out[28]:
Rating of the games made
In [29]:
sales_Play= play.groupby(['Rating', 'Platform'])['Global_Sales'].sum()
sales_Play.unstack().plot(kind='bar',stacked=True, colormap= 'Oranges', grid=False)
Out[29]:
Taking a closer look at Xbox
In [30]:
xb= df_clean[(df_clean['Platform']== 'X360') | (df_clean['Platform']== 'XOne')
| (df_clean['Platform']== 'XB')]
xb.shape
Out[30]:
Global sales of the Xbox consoles globally 1994-2016
In [32]:
sales_xb= xb.groupby(['Year_of_Release', 'Platform'])['Global_Sales'].sum()
sales_xb.unstack().plot(kind='bar',stacked=True, colormap= 'Vega20', grid=False)
Out[32]:
Top selling genre per Xbox console. The top selling genre is the shooter, which makes sense because of the halo franchise.
In [33]:
sales_xb= xb.groupby(['Genre', 'Platform'])['Global_Sales'].sum()
sales_xb.unstack().plot(kind='bar',stacked=True, colormap= 'Vega20', grid=False)
Out[33]:
Rating and global sales
In [35]:
sales_xb= xb.groupby(['Rating', 'Platform'])['Global_Sales'].sum()
sales_xb.unstack().plot(kind='bar',stacked=True, colormap= 'Vega20', grid=False)
Out[35]:
Taking a closer look at nintendo
In [36]:
nintendo= df_clean[(df_clean['Platform']== 'DS') | (df_clean['Platform']== 'Wii')
| (df_clean['Platform']== 'GC')| (df_clean['Platform']== 'GBA')
|(df_clean['Platform']== '3DS') | (df_clean['Platform']== 'WiiU')]
nintendo.shape
Out[36]:
Platform and total global sales from 1994-2016
In [37]:
nintendo_sales= nintendo.groupby(['Year_of_Release', 'Platform'])['Global_Sales'].sum()
nintendo_sales.unstack().plot(kind='bar',stacked=True, colormap= 'Set1', grid=False)
Out[37]:
Genre and total sales on platform. Nintendo looks to be selling alot of sports oriented games, especially on the Wii. However the Wii U is struggling in sales.
In [38]:
nintendo_sales= nintendo.groupby(['Genre', 'Platform'])['Global_Sales'].sum()
nintendo_sales.unstack().plot(kind='bar',stacked=True, colormap= 'Set1', grid=False)
Out[38]:
Rating and total global sales. Nintendo sold majorly in the category of E (everyone)
In [39]:
nintendo_sales= nintendo.groupby(['Rating', 'Platform'])['Global_Sales'].sum()
nintendo_sales.unstack().plot(kind='bar',stacked=True, colormap= 'Set1', grid=False)
Out[39]:
In [52]:
current_gen= df_clean[(df_clean['Platform']== 'Wii') | (df_clean['Platform']== 'X360') |
(df_clean['Platform']== 'PS3')]
current_gen.shape
Out[52]:
Comparing the top selling platforms, last generation
In [54]:
current_gen_sales= current_gen.groupby(['Year_of_Release', 'Platform'])['Global_Sales'].sum()
current_gen_sales.unstack().plot(kind='bar',stacked=True, colormap= 'Set2', grid=False)
Out[54]:
In [55]:
current_gen_sales= current_gen.groupby(['Genre', 'Platform'])['Global_Sales'].sum()
current_gen_sales.unstack().plot(kind='bar',stacked=True, colormap= 'Set2', grid=False)
Out[55]:
In [56]:
current_gen_sales= current_gen.groupby(['Rating', 'Platform'])['Global_Sales'].sum()
current_gen_sales.unstack().plot(kind='bar',stacked=True, colormap= 'Set2', grid=False)
Out[56]:
last generation sales from North America
In [44]:
current_gen_sales= current_gen.groupby(['Year_of_Release', 'Platform'])['NA_Sales'].sum()
current_gen_sales.unstack().plot(kind='bar',stacked=True, colormap= 'Blues', grid=False)
Out[44]:
Last generation sales from Japan
In [45]:
current_gen_sales= current_gen.groupby(['Year_of_Release', 'Platform'])['JP_Sales'].sum()
current_gen_sales.unstack().plot(kind='bar',stacked=True, colormap= 'Blues', grid=False)
Out[45]:
Last generation sales from EU
In [46]:
current_gen_sales= current_gen.groupby(['Year_of_Release', 'Platform'])['EU_Sales'].sum()
current_gen_sales.unstack().plot(kind='bar',stacked=True, colormap= 'Blues', grid=False)
Out[46]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: